import pandas as pd
import numpy as np
# Simulate weekly sales for 500 stores over the 4 weeks before Christmas.
# Ground truth: being on sale has a constant treatment effect of +50.
units = range(1, 500 + 1)
np.random.seed(42)
data = (
    pd.DataFrame(
        dict(
            store=np.repeat(units, 4),
            # store-level fixed effect, repeated for each of the 4 weeks
            unit_fe=np.repeat(np.random.normal(0, 5, size=len(units)), 4),
            weeks_to_xmas=np.tile([3, 2, 1, 0], len(units)),
            avg_week_sales=np.repeat(np.random.gamma(20, 1, size=len(units)), 4).round(
                2
            ),
        )
    )
    .assign(
        # treatment (sale) is confounded by the fixed effect, baseline sales
        # and proximity to Christmas
        is_on_sale=lambda d: (
            (
                (d["unit_fe"] > 0) * np.random.binomial(1, 0.2, d.shape[0])
                | (d["avg_week_sales"] > np.random.gamma(25, 1, size=len(units) * 4))
                | (np.random.normal(d["weeks_to_xmas"], 2) > 2)
                * np.random.binomial(1, 0.7, size=d.shape[0])  # xmas
            )
            * 1
        ),
    )
    .assign(
        # potential outcome without a sale
        y0=lambda d: (
            -10
            + 10 * d["unit_fe"]
            + d["avg_week_sales"]
            + 2 * d["avg_week_sales"] * d["weeks_to_xmas"]
        )
    )
    # potential outcome under a sale: constant effect of +50
    .assign(y1=lambda d: d["y0"] + 50)
    .assign(
        tau=lambda d: d["y1"] - d["y0"],
        # observed outcome: noisy realization of the relevant potential outcome
        weekly_amount_sold=lambda d: (
            np.random.normal(np.where(d["is_on_sale"] == 1, d["y1"], d["y0"]), 10)
            .clip(0, np.inf)
            .round(2)
        ),
    )
    # drop everything a real dataset would not reveal
    .drop(columns=["unit_fe", "y0", "y1", "tau"])
)
data.to_csv("../data/xmas_sales.csv", index=False)
# Simulated data
# Chapter 1
import pandas as pd
import numpy as np
# Cross-sell e-mail dataset (Chapter 1): relabel the online-classroom
# format columns as randomized e-mail arms and derive age/conversion.
pd.set_option("display.max_rows", 5)
np.random.seed(123)
data = (
    pd.read_csv("../data/online_classroom.csv")
    .assign(
        # map class format to the e-mail arm each customer received
        cross_sell_email=lambda d: np.select(
            [d["format_ol"].astype(bool), d["format_blended"].astype(bool)],
            ["no_email", "long"],
            default="short",
        )
    )
    .assign(age=lambda d: np.random.gamma(1.5, 5, d.shape[0]).astype(int) + 14)
    # conversion is a deterministic threshold on exam score minus age
    .assign(conversion=lambda d: ((d["falsexam"] - d["age"]) > 70).astype(int))
    .drop(
        columns=[
            "asian",
            "black",
            "hawaiian",
            "hispanic",
            "unknown",
            "white",
            "format_ol",
            "format_blended",
            "falsexam",
        ]
    )
)
data.to_csv("../data/cross_sell_email.csv", index=False)
# Chapter 4
# Recommender A/B-test dataset (Chapter 4): benchmark vs challenger
# recommender, with watch time as the outcome.
np.random.seed(123)
data = (
    pd.read_csv("../data/online_classroom.csv")
    .assign(
        # both original class formats map to the benchmark arm
        recommender=lambda d: np.select(
            [d["format_ol"].astype(bool), d["format_blended"].astype(bool)],
            ["benchmark", "benchmark"],
            default="challenger",
        )
    )
    .assign(age=lambda d: np.random.gamma(1.5, 5, d.shape[0]).astype(int) + 14)
    .assign(tenure=lambda d: np.random.gamma(1, 1.1, d.shape[0]).astype(int).clip(0, 4))
    .assign(
        # challenger adds +3 on the latent watch-time scale
        watch_time=lambda d: np.random.normal(
            3 * (d["recommender"] == "challenger")
            + 10 * d["tenure"]
            - 0.8 * d["age"]
            + 0.001 * d["age"] ** 2,
            11,
        )
    )
    .assign(
        # min-max rescale watch time into a 0-6 (hours) range
        watch_time=lambda d: (
            (d["watch_time"] - d["watch_time"].min())
            / (d["watch_time"].max() + 20 - d["watch_time"].min())
            * 6
        ).round(2)
    )
    .drop(
        columns=[
            "asian",
            "gender",
            "black",
            "hawaiian",
            "hispanic",
            "unknown",
            "white",
            "format_ol",
            "format_blended",
            "falsexam",
        ]
    )
)
data.to_csv("../data/rec_ab_test.csv", index=False)
np.random.seed(123)
# Observational credit-risk dataset: wage-based covariates drive both the
# credit limit (treatment) and default (outcome), creating confounding.
risk_data = (
    pd.read_csv("../data/wage.csv")
    .sample(50000, replace=True)
    .assign(wage=lambda d: np.random.normal(100 + d["wage"], 50).round(-1))
    .assign(
        credit_score1=lambda d: np.random.normal(
            100 + 0.1 * d["wage"] + d["urban"] + d["educ"] + 2 * d["exper"], 50
        ).round(-1)
    )
    .assign(
        credit_score2=lambda d: np.random.normal(
            100 + 0.05 * d["wage"] + d["hours"] + d["age"] + 2 * d["IQ"], 50
        ).round(-1)
    )
    .assign(
        # rescale score 1 to roughly a 0-1000 range
        credit_score1=lambda d: np.round(
            1000
            * (d["credit_score1"] - d["credit_score1"].min() + 20)
            / (d["credit_score1"].max() - d["credit_score1"].min()),
            0,
        )
    )
    .assign(
        # rescale score 2 the same way
        credit_score2=lambda d: np.round(
            1000
            * (d["credit_score2"] - d["credit_score2"].min() + 20)
            / (d["credit_score2"].max() - d["credit_score2"].min()),
            0,
        )
    )
    .assign(
        # limit depends on score-1 and wage deciles -> confounded treatment
        credit_limit=lambda d: (
            np.random.normal(
                500
                + 1.5 * pd.IntervalIndex(pd.qcut(d["credit_score1"], 10)).mid
                + 1.5 * pd.IntervalIndex(pd.qcut(d["wage"], 10)).mid,
                1000,
            )
            .round(-2)
            .clip(200, np.inf)
        )
    )
    .assign(
        # default is a noisy threshold on scores, IQ, limit and wage
        default=lambda d: (
            np.random.normal(
                +0
                - 0.1 * d["credit_score1"]
                - 0.5 * d["credit_score2"]
                + 0.05 * d["IQ"]
                + 0.005 * d["credit_limit"]
                - 0.17 * d["wage"],
                375,
            )
            > -50
        ).astype(int)
    )
    .drop(
        columns=[
            "IQ",
            "lhwage",
            "tenure",
            "black",
            "hours",
            "sibs",
            "south",
            "urban",
            "brthord",
            "meduc",
            "feduc",
            "age",
        ]
    )
    .reset_index(drop=True)
)
# Spend dataset: same units as risk_data, outcome is card spend
# (concave in credit limit via the 1/2.5 power).
spend_data = risk_data.assign(
    spend=lambda d: np.random.normal(
        50
        + 120 * np.power(d["credit_limit"], 1 / 2.5)
        - d["credit_score1"]
        + 1.2 * d["wage"]
        - 10 * d["exper"]
        - 10 * d["educ"]
        + 400 * d["married"]
    ).astype(int)
).drop(columns=["default"])
risk_data.to_csv("../data/risk_data.csv", index=False)
spend_data.to_csv("../data/spend_data.csv", index=False)
np.random.seed(123)
# Randomized-limit version: the credit limit is drawn from a beta
# distribution keyed only on the *bucketed* credit_score1, so the limit is
# as-good-as-random conditional on the bucket.
risk_data_rnd = (
    pd.read_csv("../data/wage.csv")
    .sample(10000, replace=True)
    .assign(wage=lambda d: np.random.normal(100 + d["wage"], 50).round(-1))
    .assign(
        credit_score1=lambda d: np.random.normal(
            100 + 0.1 * d["wage"] + d["urban"] + d["educ"] + 2 * d["exper"], 50
        ).round(-1)
    )
    .assign(
        credit_score2=lambda d: np.random.normal(
            100 + 0.05 * d["wage"] + d["hours"] + d["age"] + 2 * d["IQ"], 50
        ).round(-1)
    )
    .assign(
        # rescale score 1 to roughly a 0-1000 range
        credit_score1=lambda d: np.round(
            1000
            * (d["credit_score1"] - d["credit_score1"].min() + 20)
            / (d["credit_score1"].max() - d["credit_score1"].min()),
            0,
        )
    )
    .assign(
        credit_score2=lambda d: np.round(
            1000
            * (d["credit_score2"] - d["credit_score2"].min() + 20)
            / (d["credit_score2"].max() - d["credit_score2"].min()),
            0,
        )
    )
    .assign(
        # bucket score 1 to the nearest multiple of 200
        credit_score1_buckets=lambda d: (
            (d["credit_score1"] / 200).round(0) * 200
        ).astype(int)
    )
    .assign(
        credit_limit=lambda d: (
            np.random.beta(1 + d["credit_score1_buckets"] / 100, 6) * 10000
        ).round(-2)
    )
    .assign(
        default=lambda d: (
            np.random.normal(
                +0
                - 0.1 * d["credit_score1"]
                - 0.5 * d["credit_score2"]
                + 0.05 * d["IQ"]
                + 0.5 * d["sibs"]
                - 0.5 * d["feduc"]
                + 0.005 * d["credit_limit"]
                - 0.17 * d["wage"],
                100,
            )
            > -300
        ).astype(int)
    )
    .drop(
        columns=[
            "IQ",
            "lhwage",
            "tenure",
            "black",
            "hours",
            "sibs",
            "south",
            "urban",
            "brthord",
            "meduc",
            "feduc",
            "age",
        ]
    )
    .reset_index(drop=True)
)
risk_data_rnd.to_csv("../data/risk_data_rnd.csv", index=False)
np.random.seed(123)
# Randomized-limit spend dataset: here credit_score1 is pure noise, so the
# bucketed limit is unconfounded with spend.
spend_data_rnd = (
    pd.read_csv("../data/wage.csv")
    .sample(500, replace=True)
    .assign(wage=lambda d: np.random.normal(100 + d["wage"], 50).round(-1))
    .assign(credit_score1=lambda d: np.random.normal(100, 50, len(d)).round(-1))
    .assign(
        # rescale the (noise) score to roughly a 0-1000 range
        credit_score1=lambda d: np.round(
            1000
            * (d["credit_score1"] - d["credit_score1"].min() + 20)
            / (d["credit_score1"].max() - d["credit_score1"].min()),
            0,
        )
    )
    .assign(
        # bucket score 1 to the nearest multiple of 200
        credit_score1_buckets=lambda d: (
            (d["credit_score1"] / 200).round(0) * 200
        ).astype(int)
    )
    .assign(
        credit_limit=lambda d: (
            np.random.beta(1 + d["credit_score1_buckets"] / 100, 6) * 10000
        ).round(-2)
    )
    .assign(
        spend=lambda d: np.random.normal(
            500
            + 20 * np.power(d["credit_limit"], 1 / 2)
            + 1.2 * d["wage"]
            - 10 * d["exper"]
            - 10 * d["educ"]
            + 400 * d["married"],
            600,
        ).astype(int)
    )
    .drop(
        columns=[
            "IQ",
            "lhwage",
            "tenure",
            "black",
            "hours",
            "sibs",
            "south",
            "urban",
            "brthord",
            "meduc",
            "feduc",
            "age",
        ]
    )
    .reset_index(drop=True)
)
spend_data_rnd.to_csv("../data/spend_data_rnd.csv", index=False)
# Chapter 5
import numpy as np
import pandas as pd
# Management-training dataset (Chapter 5): relabel the learning-mindset
# study columns as an HR intervention, tweaking treatment assignment to
# reduce overlap for pedagogical purposes.
np.random.seed(123)
df = (
    pd.read_csv(
        "https://raw.githubusercontent.com/matheusfacure/python-causality-handbook/master/causal-inference-for-the-brave-and-true/data/learning_mindset.csv"
    )
    .rename(
        columns={
            "schoolid": "departament_id",
            "achievement_score": "engagement_score",
            "success_expect": "tenure",
            "school_achievement": "last_engagement_score",
            "school_urbanicity": "role",
            "school_poverty": "department_score",
            "school_size": "department_size",
            "ethnicity": "n_of_reports",
        }
    )
    # reduce overlap for better examples: make treatment more likely for
    # units with high prior engagement ...
    .assign(
        intervention=lambda d: (
            d["intervention"].astype(bool)
            | (np.random.normal(d["last_engagement_score"]) > 1.65)
        ).astype(int)
    )
    # ... and for units with long tenure
    .assign(
        intervention=lambda d: (
            d["intervention"].astype(bool) | (np.random.normal(d["tenure"], 2) > 7)
        ).astype(int)
    )
    .assign(n_of_reports=lambda d: d["n_of_reports"].clip(0, 8))
    .assign(
        # pass the string "sum", not the builtin `sum`: pandas deprecated
        # builtin callables in GroupBy.transform (FutureWarning); result is
        # identical
        department_size=lambda d: d.groupby("departament_id")["n_of_reports"].transform(
            "sum"
        )
    )
    .assign(
        last_engagement_score=lambda d: np.random.normal(d["last_engagement_score"])
    )
    .drop(columns=["frst_in_family", "school_mindset", "school_ethnic_minority"])
)
df.to_csv("../data/management_training.csv", index=False)
# NOTE: the notebook emitted a pandas FutureWarning at this cell — passing
# the builtin `sum` to GroupBy.transform is deprecated; pass the string
# "sum" instead to keep the current behavior.
# Continuous-treatment example: interest rate (treatment) affects loan
# duration (outcome); both are confounded by the two ML scores.
np.random.seed(123)
n = 10000
x1 = np.random.uniform(-1, 1, n)
x2 = np.random.uniform(-1, 1, n)
# treatment: interest rate in [1, 14], decreasing in the ML scores
t = np.random.normal(7.5 - 1 * (x1 + x2), 2).clip(1, 14).round(1)
# outcome: duration decreases with the rate (true effect -0.8)
y = np.random.normal((23 - 0.8 * t - 8 * (x1 + x2))).clip(1, np.inf).round(0)
df_cont_t = pd.DataFrame(dict(ml_1=x1, ml_2=x2, interest=t, duration=y))
df_cont_t.to_csv("../data/interest_rate.csv", index=False)
# Chapter 6
import pandas as pd
from pandas.tseries import holiday
# Calendar fixed effects for the daily restaurant-sales simulation.
np.random.seed(123)
# month -> sales fixed effect (peaks around the holidays)
month_fe = {
    1: 23,
    2: 11,
    3: 10,
    4: 9,
    5: 4,
    6: 2,
    7: 10,
    8: 5,
    9: 10,
    10: 20,
    11: 25,
    12: 30,
}
# weekday -> fixed effect (0 = Monday)
week_fe = {0: 2, 1: 2, 2: 3, 3: 7, 4: 15, 5: 12, 6: 5}
# restaurant id -> fixed effect
store_fe = {0: 20, 1: 14, 2: 3, 3: 10, 4: 9, 5: 12, 6: 5}
day = pd.Series(pd.date_range("2016-01-01", "2019-01-01"))
weekend = day.dt.weekday >= 5
is_holiday = day.isin(
    holiday.USFederalHolidayCalendar().holidays("2016-01-01", "2019-01-01")
)
is_dec = day.dt.month == 12
is_nov = day.dt.month == 11
time_data = pd.DataFrame(
    dict(
        day=day,
        month=day.dt.month,
        weekday=day.dt.weekday,
        weekend=weekend,
        is_holiday=is_holiday,
        is_dec=day.dt.month == 12,
        is_nov=day.dt.month == 11,
    )
).assign(
    month_fe=lambda d: d["month"].map(month_fe),
    week_fe=lambda d: d["weekday"].map(week_fe),
)
# One row per restaurant-day: competitor price, discount and sales.
data = (
    pd.concat([time_data.assign(rest_id=i) for i in range(len(store_fe))])
    .assign(store_fe=lambda d: d["rest_id"].map(store_fe))
    .assign(
        # competitor price in (1, 10), shifted by holidays/season and store
        competitors_price=lambda d: (
            np.random.beta(
                5 + d["is_holiday"] + d["is_dec"] * 2 + d["is_nov"],
                0.5 * d["store_fe"],
                len(d),
            )
            * 9
            + 1
        ).round(2)
    )
    .assign(
        # baseline sales without any discount
        sales0=lambda d: np.random.poisson(
            d["month_fe"]
            + d["store_fe"]
            + d["week_fe"]
            + 3 * d["competitors_price"]
            + 10 * d["is_holiday"]
            + 5 * d["weekend"]
        )
    )
    .assign(
        # per-unit discount effect, heterogeneous across calendar and store
        effect=lambda d: np.random.poisson(
            150
            + d["month_fe"]
            - d["store_fe"]
            + d["week_fe"]
            - 40 * np.sqrt(d["competitors_price"])
            + 5 * d["is_holiday"]
            - 3 * d["weekend"]
        )
    )
    .assign(
        # discounts in steps of 5 between 0 and 45
        discounts=lambda d: (
            ((np.random.beta(1, 3, size=len(d)) * 50) / 5).astype(int) * 5
        )
    )
    .assign(sales=lambda d: d["sales0"] + 0.5 * d["effect"] * d["discounts"])[
        [
            "rest_id",
            "day",
            "month",
            "weekday",
            "weekend",
            "is_holiday",
            "is_dec",
            "is_nov",
            "competitors_price",
            "discounts",
            "sales",
        ]
    ]
)
data.to_csv("../data/daily_restaurant_sales.csv", index=False)
# Chapter 7
# Randomized marketing-e-mail dataset (Chapter 7).
np.random.seed(42)
# product category -> Poisson rate of purchases
categs = {
    "vehicle": 0.1,
    "food": 1,
    "beverage": 1,
    "art": 1,
    "baby": 1,
    "personal_care": 1,
    "toys": 1,
    "clothing": 2,
    "decor": 1,
    "cell_phones": 3,
    "construction": 1,
    "home_appliances": 1,
    "electronics": 2,
    "sports": 1,
    "tools": 1,
    "games": 2,
    "industry": 1,
    "pc": 2,
    "jewel": 1,
    "books": 1,
    "music_books_movies": 1,
    "health": 2,
}
n = 10000
age = (18 + np.random.beta(2, 7, n) * 90).round(0)
tenure = np.random.exponential(0.5, n).round(0)
# heavy-tailed spend with a 1% chance of a very large purchase
ammount_spent = (
    np.random.exponential(1, n) * 100
    + np.random.binomial(1, 0.01, n) * np.random.uniform(500, 50000, n)
).round(2)
categ_purchage = {cat: np.random.poisson(l, n) for cat, l in categs.items()}
X = pd.DataFrame(categ_purchage).assign(
    age=age,
    tenure=tenure,
    ammount_spent=ammount_spent,
)
# treatment is a fair coin flip (randomized experiment)
mkt_email_rnd = np.random.binomial(1, 0.5, n)
# baseline outcome and heterogeneous treatment effect as linear indices of X
coefs = np.concatenate([np.random.uniform(-1, 1, len(categs)), np.array([1, 20, 0.4])])
y0 = np.random.exponential(X.values.dot(coefs))
cate_coef = np.concatenate(
    [np.random.uniform(0, 100, len(categs)), np.array([-1, -20, 0])]
)
tau = np.random.normal(X.values.dot(cate_coef))
data_rnd = X.assign(mkt_email=mkt_email_rnd, next_mnth_pv=y0 + tau * mkt_email_rnd)[
    ["mkt_email", "next_mnth_pv", "age", "tenure", "ammount_spent"]
    + list(categs.keys())
].round(2)
data_rnd.to_csv("../data/email_rnd_data.csv", index=False)
np.random.seed(2)
# Observational (biased) version of the e-mail dataset: the e-mail is
# assigned as a function of covariates (reuses `categs`, `coefs` and
# `cate_coef` from the randomized block above).
n = 300000
age = (18 + np.random.beta(2, 7, n) * 90).round(0)
tenure = np.random.exponential(0.5, n).round(0)
ammount_spent = (
    np.random.exponential(1, n) * 100
    + np.random.binomial(1, 0.01, n) * np.random.uniform(500, 50000, n)
).round(2)
categ_purchage = {cat: np.random.poisson(l, n) for cat, l in categs.items()}
X = pd.DataFrame(categ_purchage).assign(
    age=age,
    tenure=tenure,
    ammount_spent=ammount_spent,
)
# propensity-score coefficients for the confounded assignment
coefs_ps = np.concatenate(
    [np.random.uniform(-1, 1, len(categs)), np.array([-1, 40, 0.4])]
)
mkt_email_biased = (np.random.normal(X.values.dot(coefs_ps), 2000) > 0).astype(int)
y0 = np.random.exponential(X.values.dot(coefs))
tau = np.random.normal(X.values.dot(cate_coef))
data_biased = X.assign(
    mkt_email=mkt_email_biased, next_mnth_pv=y0 + tau * mkt_email_biased
)[
    ["mkt_email", "next_mnth_pv", "age", "tenure", "ammount_spent"]
    + list(categs.keys())
].round(2)
data_biased.to_csv("../data/email_obs_data.csv", index=False)
import pandas as pd
import numpy as np
from pandas.tseries import holiday
# Calendar fixed effects, rebuilt for the discount dataset (same recipe as
# the daily-restaurant-sales block above).
np.random.seed(123)
# month -> sales fixed effect
month_fe = {
    1: 23,
    2: 11,
    3: 10,
    4: 9,
    5: 4,
    6: 2,
    7: 10,
    8: 5,
    9: 10,
    10: 20,
    11: 25,
    12: 30,
}
# weekday -> fixed effect (0 = Monday)
week_fe = {0: 2, 1: 2, 2: 3, 3: 7, 4: 15, 5: 12, 6: 5}
# restaurant id -> fixed effect
store_fe = {0: 20, 1: 14, 2: 3, 3: 10, 4: 9, 5: 12, 6: 5}
day = pd.Series(pd.date_range("2016-01-01", "2019-01-01"))
weekend = day.dt.weekday >= 5
is_holiday = day.isin(
    holiday.USFederalHolidayCalendar().holidays("2016-01-01", "2019-01-01")
)
is_dec = day.dt.month == 12
is_nov = day.dt.month == 11
time_data = pd.DataFrame(
    dict(
        day=day,
        month=day.dt.month,
        weekday=day.dt.weekday,
        weekend=weekend,
        is_holiday=is_holiday,
        is_dec=day.dt.month == 12,
        is_nov=day.dt.month == 11,
    )
).assign(
    month_fe=lambda d: d["month"].map(month_fe),
    week_fe=lambda d: d["weekday"].map(week_fe),
)
# Discount dataset: identical DGP to daily_restaurant_sales, saved under a
# different name for the continuous-treatment chapter.
data_cont = (
    pd.concat([time_data.assign(rest_id=i) for i in range(len(store_fe))])
    .assign(store_fe=lambda d: d["rest_id"].map(store_fe))
    .assign(
        # competitor price in (1, 10), shifted by holidays/season and store
        competitors_price=lambda d: (
            np.random.beta(
                5 + d["is_holiday"] + d["is_dec"] * 2 + d["is_nov"],
                0.5 * d["store_fe"],
                len(d),
            )
            * 9
            + 1
        ).round(2)
    )
    .assign(
        # baseline sales without any discount
        sales0=lambda d: np.random.poisson(
            d["month_fe"]
            + d["store_fe"]
            + d["week_fe"]
            + 3 * d["competitors_price"]
            + 10 * d["is_holiday"]
            + 5 * d["weekend"]
        )
    )
    .assign(
        # per-unit discount effect, heterogeneous across calendar and store
        effect=lambda d: np.random.poisson(
            150
            + d["month_fe"]
            - d["store_fe"]
            + d["week_fe"]
            - 40 * np.sqrt(d["competitors_price"])
            + 5 * d["is_holiday"]
            - 3 * d["weekend"]
        )
    )
    .assign(
        # discounts in steps of 5 between 0 and 45
        discounts=lambda d: (
            ((np.random.beta(1, 3, size=len(d)) * 50) / 5).astype(int) * 5
        )
    )
    .assign(sales=lambda d: d["sales0"] + 0.5 * d["effect"] * d["discounts"])[
        [
            "rest_id",
            "day",
            "month",
            "weekday",
            "weekend",
            "is_holiday",
            "is_dec",
            "is_nov",
            "competitors_price",
            "discounts",
            "sales",
        ]
    ]
)
data_cont.to_csv("../data/discount_data.csv", index=False)
# Chapter 8
import numpy as np
import pandas as pd
# Generate data: staggered-adoption offline-marketing panel (Chapter 8).
date = pd.date_range("2021-05-01", "2021-07-31", freq="D")
cohorts = pd.to_datetime(["2021-05-15", "2021-06-04", "2021-06-20"])
poss_regions = ["S", "N", "W", "E"]
reg_ps = dict(zip(poss_regions, [0.3, 0.6, 0.7, 0.8]))  # region treatment prob.
reg_fe = dict(zip(poss_regions, [20, 16, 8, 2]))  # region fixed effect
reg_trend = dict(zip(poss_regions, [0, 0.2, 0.4, 0.6]))  # region trend slope
units = np.array(range(1, 200 + 1))
np.random.seed(123)
unit_reg = np.random.choice(poss_regions, len(units))
exp_trend = np.random.exponential(0.01, len(units))
treated_unit = np.random.binomial(1, np.vectorize(reg_ps.__getitem__)(unit_reg))
# staggered adoption dgp
df = (
    pd.DataFrame(
        dict(
            date=np.tile(date, len(units)),  # fixed: use date instead of date.date
            city=np.repeat(units, len(date)),
            region=np.repeat(unit_reg, len(date)),
            treated_unit=np.repeat(treated_unit, len(date)),
            cohort=np.repeat(np.random.choice(cohorts.date, len(units)), len(date)),
            eff_heter=np.repeat(np.random.exponential(1, size=len(units)), len(date)),
            unit_fe=np.repeat(np.random.normal(0, 2, size=len(units)), len(date)),
            time_fe=np.tile(np.random.normal(size=len(date)), len(units)),
            week_day=np.tile(date.weekday, len(units)),
            w_seas=np.tile(abs(5 - date.weekday) % 7, len(units)),
        )
    )
    .assign(
        reg_fe=lambda d: d["region"].map(reg_fe),
        reg_trend=lambda d: d["region"].map(reg_trend),
        reg_ps=lambda d: d["region"].map(reg_ps),
        # days since the panel start
        trend=lambda d: (
            (pd.to_datetime(d["date"]) - pd.to_datetime(d["date"]).min()).dt.days
        ),
        # `day` duplicates `trend` (kept for the saved dataset's schema)
        day=lambda d: (
            (pd.to_datetime(d["date"]) - pd.to_datetime(d["date"]).min()).dt.days
        ),
        # never-treated units get a far-future cohort date
        cohort=lambda d: np.where(
            d["treated_unit"] == 1, d["cohort"], pd.to_datetime("2100-01-01")
        ),
    )
    .assign(
        treated=lambda d: (
            (d["date"] >= d["cohort"]) & (d["treated_unit"] == 1)
        ).astype(int),
    )
    .assign(
        # untreated potential outcome
        y0=lambda d: np.round(
            10
            + d["treated_unit"]
            + d["reg_trend"] * d["trend"] / 2
            + d["unit_fe"]
            + 0.4 * d["time_fe"]
            + 2 * d["reg_fe"]
            + d["w_seas"] / 5,
            0,
        ),
    )
    .assign(
        # y0 = lambda d: np.round(d["y0"] + d.groupby("city")["y0"].shift(1).fillna(0)*0.2, 0)
    )
    .assign(
        # treated potential outcome: effect ramps up linearly for 5 days
        # after the cohort date, scaled by unit-level heterogeneity
        y1=lambda d: (
            d["y0"]
            + np.minimum(
                0.2
                * (
                    np.maximum(
                        0,
                        (
                            pd.to_datetime(d["date"]) - pd.to_datetime(d["cohort"])
                        ).dt.days,
                    )
                ),
                1,
            )
            * d["eff_heter"]
            * 2
        )
    )
    .assign(
        tau=lambda d: d["y1"] - d["y0"],
        # observed outcome: relevant potential outcome plus noise
        downloads=lambda d: (
            np.where(d["treated"] == 1, d["y1"], d["y0"])
            + np.random.normal(0, 0.7, len(d))
        ),
        # date = lambda d: pd.to_datetime(d["date"]),
    )
    .round({"downloads": 0})
)
# Filter and save data: short pre/post panel covering all regions.
reg_filter = ["N", "S", "E", "W"]
mkt_data_all = (
    df.query("region.isin(@reg_filter)")
    .query("date.astype('string') <= '2021-06-01'")
    # .assign(cohort = lambda d: np.where(d["cohort"].astype(str) <= "2021-06-01", d["cohort"], "2050-01-01"))
    .drop(
        columns=[
            "reg_fe",
            "time_fe",
            "cohort",
            "w_seas",
            "week_day",
            "reg_trend",
            "trend",
            "day",
            "unit_fe",
            "y0",
            "y1",
            "eff_heter",
            "reg_ps",
            "treated_unit",
        ]
    )
    # single post period starting at the first cohort date
    .assign(post=lambda d: (d["date"].astype(str) >= "2021-05-15").astype(int))
    # a unit counts as treated in every period if it is ever treated
    .assign(treated=lambda d: d.groupby("city")["treated"].transform("max"))
)
# South-region slice and the full staggered panel (keeps the cohort column).
mkt_data = mkt_data_all.query("region=='S'")
mkt_data_cohorts = (
    df.assign(post=lambda d: (d["date"] >= d["cohort"]).astype(int))
    # a unit counts as treated in every period if it is ever treated
    .assign(treated=lambda d: d.groupby("city")["treated"].transform("max"))
    .drop(
        columns=[
            "reg_fe",
            "time_fe",
            "w_seas",
            "week_day",
            "reg_trend",
            "trend",
            "day",
            "unit_fe",
            "y0",
            "y1",
            "eff_heter",
            "reg_ps",
            "treated_unit",
        ]
    )
)
mkt_data_cohorts.to_csv("../data/offline_mkt_staggered.csv", index=False)
mkt_data_all.to_csv("../data/short_offline_mkt_all_regions.csv", index=False)
mkt_data.to_csv("../data/short_offline_mkt_south.csv", index=False)
# Chapter 9
from statsmodels.tsa.arima_process import ArmaProcess
def simulate_series(
    dates,
    name,
    pop,
    pop_pct,
    ws_w,
    ms_w,
    ys_w,
    ar_w,
    trend_w,
    noise_w,
    ar=1,
    ma=7,
):
    """Simulate a daily downloads series for one city.

    The series is a weighted sum of weekly/monthly/yearly seasonal
    components, an ARMA component, a linear trend and Gaussian noise,
    scaled by the city's population and reachable-population share.
    Returns a non-negative, rounded pd.Series named ``name``.
    """
    ws = abs(5 - dates.weekday) % 7  # weekly seasonality
    ms = abs(dates.day - 10)  # monthly seasonality
    ys = abs(6 - dates.month) % 12  # yearly seasonality
    arma = ArmaProcess(ar, ma).generate_sample(len(dates))
    trend = np.arange(0, len(dates))
    noise = np.random.normal(0, 1, len(dates))
    comps = (ws, ms, ys, arma, trend, noise)
    # jitter each component weight by up to +/-5%
    comp_noise = np.random.uniform(0.95, 1.05, len(comps))
    coefs = (ws_w, ms_w, ys_w, ar_w, trend_w, noise_w)
    result = (
        sum(c * w * noise for c, w, noise in zip(comps, coefs, comp_noise))
        * pop
        * pop_pct
        * 0.005
    )
    return pd.Series(result, name=name).clip(0, np.inf).round(0)


br_cities = pd.read_csv("../data/br_cities.csv")
# Per-state simulation parameters for the geo-experiment series.
np.random.seed(3)
states = br_cities["state"].unique()
pop_pct = np.random.beta(5, 10, len(states)) * 0.1  # reachable population share
trends = np.random.beta(9, 5, len(states)) * 0.5 - 0.25  # trend slope
week_sas = np.random.uniform(2, 4, len(states))  # weekly seasonality weight
m_sas = np.random.uniform(0, 1, len(states))  # monthly seasonality weight
y_sas = np.random.uniform(0, 2, len(states))  # yearly seasonality weight
states_params = pd.DataFrame(
    dict(
        pop_pct=pop_pct,
        state=states,
        trends=trends,
        week_sas=week_sas,
        m_sas=m_sas,
        y_sas=y_sas,
    )
)
# states_params.head()
br_cities = pd.read_csv("../data/br_cities.csv").merge(states_params, on="state")
np.random.seed(12345)
date = pd.date_range("2022-03-01", "2022-06-30", freq="D")
# one simulated download series per city (top 50 by population)
series = [
    simulate_series(
        date,
        name=city["city"],
        pop=city["pop"] * city["pop_pct"],
        pop_pct=city["pop_pct"],
        ws_w=city["week_sas"],
        ms_w=city["m_sas"],
        ys_w=city["y_sas"],
        ar_w=0.1,
        trend_w=city["trends"],
        noise_w=1000 / np.sqrt(city["pop"] * city["pop_pct"]),
    )
    .round(0)
    .to_frame()
    .assign(
        population=city["pop"],
        city=city["city"],
        state=city["state"],
        date=date,
    )
    .rename(columns={city["city"]: "app_download"})
    for _, city in br_cities.sort_values("pop", ascending=False).head(50).iterrows()
]

# auxiliary covariate ("activity") series per city
features = pd.concat(
    [
        pd.DataFrame(
            dict(
                activity=(
                    ArmaProcess([1, 0.9, 0.8, 0.7], 7).generate_sample(len(date))
                    + row["hdi"]
                ).round(0)
                * 3,
                date=date,
                state=row["state"],
                city=row["city"],
            )
        )
        for _, row in br_cities.sort_values("pop", ascending=False).head(50).iterrows()
    ]
)


def simulate_effect(df, city_col, date_col, y_col, city, start_at, window, effect_pct):
    """Add a bell-shaped (RBF) relative lift of up to ``effect_pct`` to
    ``y_col`` for ``city``, starting at ``start_at`` and peaking about
    ``window / 2`` days later."""
    window_mask = (df[date_col] >= start_at) & (df[city_col] == city)

    def rbf(x, center, sigma):
        return np.exp(-((x - center) ** 2) / sigma**2)

    # center the kernel window / 2 days after the campaign start
    # (fixed: use window / 2 instead of the garbled "window.2")
    kernel = rbf(
        (df[date_col] - pd.to_datetime(start_at)).dt.days - window / 2 - 1,
        0,
        window / 2,
    )
    y = np.where(window_mask, df[y_col] + df[y_col] * effect_pct * kernel, df[y_col])
    return df.assign(**{y_col: y})


df = pd.concat(series)
# Inject campaign effects in the three treated cities.
df = simulate_effect(
    df, "city", "date", "app_download", "sao_paulo", "2022-05-01", 40, 0.4
)
df = simulate_effect(
    df, "city", "date", "app_download", "joao_pessoa", "2022-05-01", 40, 0.5
)
df = simulate_effect(
    df, "city", "date", "app_download", "porto_alegre", "2022-05-01", 40, 0.6
)
df = df.assign(
    post=(df["date"] >= "2022-05-01") * 1,
    treated=(df["city"].isin(["sao_paulo", "joao_pessoa", "porto_alegre"])) * 1,
)
df.to_csv("../data/online_mkt.csv", index=False)

# population-normalized download rate (percent)
df_norm = df.assign(app_download_pct=100 * df["app_download"] / df["population"])
# Covariate version: adds a synthetic competitor-download metric built from
# the city's HDI, then hides HDI itself.
df_norm_cov = (
    df_norm.merge(br_cities[["city", "state", "hdi"]])
    .assign(
        comp_download_pct=lambda d: (
            100 * (0.8 * d["app_download"] + d["hdi"]) / d["population"]
        )
    )
    .drop(columns=["hdi"])
)
df_norm_cov.to_csv("../data/online_mkt_cov.csv", index=False)
# Chapter 10
import numpy as np
import pandas as pd
from statsmodels.tsa.arima_process import ArmaProcess
# Switchback-experiment DGP (Chapter 10).
T = 120  # number of time periods
m = 2  # randomization-block length
p = 0.5  # treatment probability
def y_given_d(d, effect_params=[3, 2, 1], T=T, seed=None):
    """Delivery time given a treatment path ``d``: log trend + 24-period
    sinusoidal seasonality + a distributed-lag treatment effect + noise.

    NOTE(review): the lag effect is convolved with ``~d.astype(bool)`` —
    i.e. the *complement* of the treatment path — confirm this sign
    convention is intended.
    """
    np.random.seed(seed)
    x = np.arange(1, T + 1)
    return (
        np.log(x + 1)
        + 2 * np.sin(x * 2 * np.pi / 24)
        + np.convolve(~d.astype(bool), effect_params)[: -(len(effect_params) - 1)]
        + np.random.normal(0, 1, T)
        # + ArmaProcess([3,2,], 3).generate_sample(T)
    ).round(2)
# correct tau = 3+2+1=6
effect_params = [3, 2, 1]
np.random.seed(123)
df_sb_every = (
    pd.DataFrame(
        dict(
            # treatment re-randomized every period
            d=np.random.binomial(1, 0.5, T),
        )
    )
    .assign(
        # observed outcome plus both potential-outcome paths (same seed so
        # only the treatment path differs)
        delivery_time=lambda d: y_given_d(d["d"], seed=1),
        delivery_time_1=lambda d: y_given_d(np.ones(T), seed=1),
        delivery_time_0=lambda d: y_given_d(np.zeros(T), seed=1),
    )
    .assign(tau=lambda d: d["delivery_time_1"] - d["delivery_time_0"])
)
df_sb_every.to_csv("../data/sb_exp_every.csv", index=False)


def gen_d(rand_points, p):
    """Generate a Bernoulli(p) treatment path that can only switch at the
    periods flagged in ``rand_points``; in between, the previous value is
    carried forward."""
    result = [np.random.binomial(1, p)]
    for t in rand_points[1:]:
        result.append(np.random.binomial(1, p) * t + (1 - t) * result[-1])
    return np.array(result)
# NOTE(review): this redefines the simulation's y_given_d with a random
# placeholder (original comment: "hypothetical y_given_d — adjust to the
# real implementation"), so sb_exp_opt.csv gets pure-noise outcomes.
def y_given_d(d, T, seed):
    np.random.seed(seed)
    return np.random.rand(T)  # placeholder: returns a random array
m = 2
T = 120
p = 0.5
n = T * m  # fixed: use T * m instead of the garbled "T.m"
# switchback design: randomization points spaced every m periods
rand_points_opt = np.isin(
    np.arange(1, T + 1), [1] + [i * m + 1 for i in range(2, int(n) - 1)]
)
d_opt = gen_d(rand_points_opt, p)
y_opt = y_given_d(d_opt, T=T, seed=1)
pd.DataFrame(dict(rand_points=rand_points_opt, d=d_opt, delivery_time=y_opt)).to_csv(
    "../data/sb_exp_opt.csv", index=False
)
# Chapter 11
# IV dataset (Chapter 11): prime-card eligibility as an instrument for
# holding the card, with compliers and never-takers only.
n = 10000
# confounders
age = 18 + np.random.normal(24, 4, n).round(1)
income = 500 + np.random.gamma(1, age * 100, n).round(0)
credit_score = (np.random.beta(1, 3, n) * 1000).round(0)
u = np.random.uniform(-1, 1, n)  # unobserved confounder
categs = ["never-taker", "complier"]
cutoff = 0.6
# latent propensity determines compliance type
e = 1 / (1 + np.exp(-(u - 0.05 * age + 0.01 * credit_score)))
cust_categ = np.select([e <= cutoff, e > cutoff], categs)
# plt.hist(e)
# Instrument
prime_elegible = np.random.binomial(1, 0.5, n)
# only compliers take the card when eligible (no always-takers/defiers)
choose_prime = ((cust_categ == "complier") & (prime_elegible == 1)).astype(int)
# outcome
group_effect = np.select(
    [cust_categ == "complier", cust_categ == "never-taker"], [700, 200]
)
pv_0 = np.random.normal(200 + 0.4 * income + 10 * age - 500 * u, 200).round(2)
pv_1 = pv_0 + group_effect
tau = pv_1 - pv_0
# observed outcome: only card holders realize the group effect
pv = pv_0 + group_effect * choose_prime
df = pd.DataFrame(
    dict(
        categ=cust_categ,
        age=age,
        income=income,
        credit_score=credit_score,
        prime_elegible=prime_elegible,
        prime_card=choose_prime,
        pv=pv,
        tau=tau,
    )
)[
    [
        "age",
        "income",
        "credit_score",
        "prime_elegible",
        "prime_card",
        "pv",
        "tau",
        "categ",
    ]
]
df.to_csv("../data/prime_card.csv", index=False)